This markdown file was created by Sunah Park for extended lab exercises in the book An Introduction to Statistical Learning with Applications in R (ISLR).
# NOTE(review): rm(list=ls()) in a script/Rmd is discouraged — it only clears
# the global environment and gives a false sense of a fresh session.
rm(list=ls())
# Default knitr chunk options applied to every chunk in this document.
knitr::opts_chunk$set(
########## Text results ##########
echo=TRUE,
warning=FALSE, # suppress warnings in the rendered output (FALSE hides them, it does not preserve them)
error=FALSE, # stop knitting if a chunk errors; errors are NOT captured inline
message=FALSE, # suppress messages in the rendered output
strip.white=TRUE, # to remove the white lines in the beginning or end of a source chunk in the output
########## Cache ##########
cache=TRUE,
########## Plots ##########
fig.path="", # prefix to be used for figure filenames
fig.width=8,
fig.height=6,
dpi=200
)library(ISLR)
library(leaps) # regsubsets() function
library(ggplot2)summary(Hitters) # Hitters data from ISLR library## AtBat Hits HmRun Runs
## Min. : 16.0 Min. : 1 Min. : 0.00 Min. : 0.00
## 1st Qu.:255.2 1st Qu.: 64 1st Qu.: 4.00 1st Qu.: 30.25
## Median :379.5 Median : 96 Median : 8.00 Median : 48.00
## Mean :380.9 Mean :101 Mean :10.77 Mean : 50.91
## 3rd Qu.:512.0 3rd Qu.:137 3rd Qu.:16.00 3rd Qu.: 69.00
## Max. :687.0 Max. :238 Max. :40.00 Max. :130.00
##
## RBI Walks Years CAtBat
## Min. : 0.00 Min. : 0.00 Min. : 1.000 Min. : 19.0
## 1st Qu.: 28.00 1st Qu.: 22.00 1st Qu.: 4.000 1st Qu.: 816.8
## Median : 44.00 Median : 35.00 Median : 6.000 Median : 1928.0
## Mean : 48.03 Mean : 38.74 Mean : 7.444 Mean : 2648.7
## 3rd Qu.: 64.75 3rd Qu.: 53.00 3rd Qu.:11.000 3rd Qu.: 3924.2
## Max. :121.00 Max. :105.00 Max. :24.000 Max. :14053.0
##
## CHits CHmRun CRuns CRBI
## Min. : 4.0 Min. : 0.00 Min. : 1.0 Min. : 0.00
## 1st Qu.: 209.0 1st Qu.: 14.00 1st Qu.: 100.2 1st Qu.: 88.75
## Median : 508.0 Median : 37.50 Median : 247.0 Median : 220.50
## Mean : 717.6 Mean : 69.49 Mean : 358.8 Mean : 330.12
## 3rd Qu.:1059.2 3rd Qu.: 90.00 3rd Qu.: 526.2 3rd Qu.: 426.25
## Max. :4256.0 Max. :548.00 Max. :2165.0 Max. :1659.00
##
## CWalks League Division PutOuts Assists
## Min. : 0.00 A:175 E:157 Min. : 0.0 Min. : 0.0
## 1st Qu.: 67.25 N:147 W:165 1st Qu.: 109.2 1st Qu.: 7.0
## Median : 170.50 Median : 212.0 Median : 39.5
## Mean : 260.24 Mean : 288.9 Mean :106.9
## 3rd Qu.: 339.25 3rd Qu.: 325.0 3rd Qu.:166.0
## Max. :1566.00 Max. :1378.0 Max. :492.0
##
## Errors Salary NewLeague
## Min. : 0.00 Min. : 67.5 A:176
## 1st Qu.: 3.00 1st Qu.: 190.0 N:146
## Median : 6.00 Median : 425.0
## Mean : 8.04 Mean : 535.9
## 3rd Qu.:11.00 3rd Qu.: 750.0
## Max. :32.00 Max. :2460.0
## NA's :59
head(Hitters,3)## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun
## -Andy Allanson 293 66 1 30 29 14 1 293 66 1
## -Alan Ashby 315 81 7 24 38 39 14 3449 835 69
## -Alvin Davis 479 130 18 66 72 76 3 1624 457 63
## CRuns CRBI CWalks League Division PutOuts Assists Errors
## -Andy Allanson 30 29 14 A E 446 33 20
## -Alan Ashby 321 414 375 N W 632 43 10
## -Alvin Davis 224 266 263 A W 880 82 14
## Salary NewLeague
## -Andy Allanson NA A
## -Alan Ashby 475 N
## -Alvin Davis 480 A
# Drop the 59 players whose Salary is missing before fitting any models.
Hitters<-na.omit(Hitters)
dim(Hitters)## [1] 263 20
# Best subset selection via exhaustive search; by default regsubsets()
# reports the best model of each size only up to 8 predictors.
regfit.full<-regsubsets(Salary~., data=Hitters) # from leaps library
summary(regfit.full)## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " " " "*" "*" " " " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " " " "
# Refit allowing subsets of every size up to all 19 predictors
# (20 columns minus the Salary response).
nvmax<-19 # 20-1=19 maximum predictors
regfit.full<-regsubsets(Salary~., data=Hitters, nvmax=nvmax) #nvmax can be used in order to return as many variables as are desired (=maximum size of subsets to examine)
summary(regfit.full)## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = nvmax)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " " " "*" "*" " " " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
names(summary(regfit.full))## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
The regsubsets() function performs best subset selection by identifying the best model that contains a given number of predictors, where best is quantified using RSS. An asterisk indicates that a given variable is included in the corresponding model. For instance, this output indicates that the best two-variable model contains only Hits and CRBI. By default, regsubsets() only reports results up to the best eight-variable model. But the nvmax option can be used in order to return as many variables as are desired.
The summary() function returns R^2, RSS, adjusted R^2, Cp and BIC. We can examine these to try to select the best overall model. For instance, we see that the R^2 statistic increases from 32%, when only one variable is included in the model, to almost 55% when all variables are included. As expected, the R^2 statistic increases monotonically as more variables are included.
# Collect the model-selection statistics (R^2, RSS, adjusted R^2, Cp, BIC)
# for each subset size into one data frame for plotting.
# Call summary() once and reuse it instead of recomputing it per column,
# and use seq_len(nvmax) rather than the seq(1:nvmax) anti-idiom.
sum_full <- summary(regfit.full)
df_full <- data.frame(i = seq_len(nvmax),
                      rsq = sum_full$rsq,
                      rss = sum_full$rss,
                      adjr2 = sum_full$adjr2,
                      cp = sum_full$cp,
                      bic = sum_full$bic)
# Plot each selection statistic against subset size (ggplot2), alongside the
# built-in plot.regsubsets() display of the same statistic.
# NOTE(review): coord_cartesian(x=...) relies on partial argument matching to
# xlim — prefer spelling it out as coord_cartesian(xlim=c(0,nvmax)).
# R-squared
ggplot(data=df_full, aes(x=i, y=rsq))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="R-squared")plot(regfit.full, scale="r2")# Residual Sum of Squares
ggplot(data=df_full, aes(x=i, y=rss))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="Residual Sum of Squares")# adjusted R-squared
ggplot(data=df_full, aes(x=i, y=adjr2))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="Adjusted R-Squared")plot(regfit.full, scale="adjr2")# cp
ggplot(data=df_full, aes(x=i, y=cp))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="Mallows' Cp")plot(regfit.full, scale="Cp")# bic
ggplot(data=df_full, aes(x=i, y=bic))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="BIC")plot(regfit.full, scale="bic")coef(regfit.full,6)## (Intercept) AtBat Hits Walks CRBI
## 91.5117981 -1.8685892 7.6043976 3.6976468 0.6430169
## DivisionW PutOuts
## -122.9515338 0.2643076
The regsubsets() function has a built-in plot() command which can be used to display the selected variables for the best model with a given number of predictors, ranked according to the BIC, Cp, adjusted R^2, or AIC (?plot.regsubsets). The top row of each plot contains a black square for each variable selected according to the optimal model associated with that statistic.
We can also use the regsubsets() function to perform forward stepwise or backward stepwise selection, using the argument method=“forward” or method=“backward”. Backward selection requires that the number of samples n is larger than the number of variables p (so that the full model can be fit). In contrast, forward stepwise can be used even when n is smaller than p, and so is the only viable subset method when p is very large.
Forward stepwise selection is a computationally efficient alternative to best subset selection. While the best subset selection procedure considers all 2^p possible models containing subsets of the p predictors, forward stepwise considers a much smaller set of models. Forward stepwise selection begins with a model containing no predictors, and then adds predictors to the model, one-at-a-time, until all of the predictors are in the model. In particular, at each step the variable that gives the greatest additional improvement to the fit is added to the model.
# Forward stepwise selection: same search as before but greedy/additive.
regfit.fwd<-regsubsets(Salary~., data=Hitters, nvmax=nvmax, method="forward") # from leaps library
summary(regfit.fwd)## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = nvmax,
## method = "forward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
# Selection statistics for the forward-stepwise fits, one row per subset size.
# summary() is computed once and reused; seq_len() replaces seq(1:nvmax).
sum_fwd <- summary(regfit.fwd)
df_fwd <- data.frame(i = seq_len(nvmax),
                     rsq = sum_fwd$rsq,
                     rss = sum_fwd$rss,
                     adjr2 = sum_fwd$adjr2,
                     cp = sum_fwd$cp,
                     bic = sum_fwd$bic)
# R-squared
ggplot(data=df_fwd, aes(x=i, y=rsq))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="R-squared")plot(regfit.fwd, scale="r2")# Residual Sum of Squares
ggplot(data=df_fwd, aes(x=i, y=rss))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="Residual Sum of Squares")# adjusted R-squared
ggplot(data=df_fwd, aes(x=i, y=adjr2))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="Adjusted R-Squared")plot(regfit.fwd, scale="adjr2")# cp
ggplot(data=df_fwd, aes(x=i, y=cp))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="Mallows' Cp")plot(regfit.fwd, scale="Cp")# bic
ggplot(data=df_fwd, aes(x=i, y=bic))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="BIC")plot(regfit.fwd, scale="bic")Backward stepwise selection provides an efficient alternative to best subset selection. However, unlike forward stepwise selection, it begins with the full least squares model containing all p predictors, and then iteratively removes the least useful predictor, one-at-a-time.
# Backward stepwise selection: start from the full model, remove one at a time.
regfit.bwd<-regsubsets(Salary~., data=Hitters, nvmax=nvmax, method="backward") # from leaps library
summary(regfit.bwd)## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = nvmax,
## method = "backward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: backward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " " " " " "*" " " " " " "
## 5 ( 1 ) " " " " " " " " "*" " " " " " "
## 6 ( 1 ) " " " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
# Selection statistics for the backward-stepwise fits, one row per subset size.
# summary() is computed once and reused; seq_len() replaces seq(1:nvmax).
sum_bwd <- summary(regfit.bwd)
df_bwd <- data.frame(i = seq_len(nvmax),
                     rsq = sum_bwd$rsq,
                     rss = sum_bwd$rss,
                     adjr2 = sum_bwd$adjr2,
                     cp = sum_bwd$cp,
                     bic = sum_bwd$bic)
# R-squared
ggplot(data=df_bwd, aes(x=i, y=rsq))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="R-squared")plot(regfit.bwd, scale="r2")# Residual Sum of Squares
ggplot(data=df_bwd, aes(x=i, y=rss))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="Residual Sum of Squares")# adjusted R-squared
ggplot(data=df_bwd, aes(x=i, y=adjr2))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="Adjusted R-Squared")plot(regfit.bwd, scale="adjr2")# cp
ggplot(data=df_bwd, aes(x=i, y=cp))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="Mallows' Cp")plot(regfit.bwd, scale="Cp")# bic
ggplot(data=df_bwd, aes(x=i, y=bic))+geom_line()+geom_point(pch=4,col="red")+coord_cartesian(x=c(0,nvmax))+labs(x="Number of Predictors", y="BIC")plot(regfit.bwd, scale="bic")To yield accurate estimates for the test error, we must use only the training observations to perform all aspects of model fitting — including variable selection. Therefore, the determination of which model of a given size is best must be made using only the training observations. If the full data set is used to perform the best subset selection step, the validation set errors and cross-validation errors that we obtain will not be accurate estimates of the test error.
# Fix the RNG seed so the train/test split is reproducible.
set.seed(1)
# Randomly assign each observation to the training (TRUE) or test (FALSE) set.
# FIX: use TRUE/FALSE rather than the reassignable shorthands T/F, and spell
# out replace= instead of relying on partial matching of rep=.
train <- sample(c(TRUE, FALSE), nrow(Hitters), replace = TRUE)
test <- !train
# Apply regsubsets() to the training observations only, so model selection
# sees no information from the held-out test set.
regfit.best <- regsubsets(Salary ~ ., data = Hitters[train, ], nvmax = 19) # regsubsets() to the training set in order to perform best subset selection
test.mat<-model.matrix(Salary~., data=Hitters[test,])The model.matrix() function is used in many regression packages for building an “X” matrix from data. Now we run a loop, and for each size i, we extract the coefficients from regfit.best for the best model of that size, multiply them into the appropriate columns of the test model matrix to form the predictions, and compute the test MSE.
# FIX: the original refit regsubsets() on the FULL data set at this point,
# overwriting the training-set fit from the previous chunk before the
# validation errors were computed — leaking test information into model
# selection. The loop below now uses the training-set regfit.best; the
# full-data refit is moved to AFTER validation, where it belongs.
val.errors <- rep(NA, 19)
for (i in seq_len(19)) {
  coefi <- coef(regfit.best, id = i)
  # Predictions for the test observations from the best i-variable model.
  pred <- test.mat[, names(coefi)] %*% coefi
  val.errors[i] <- mean((Hitters$Salary[test] - pred)^2)
}
# After selecting the best size on the validation set, refit on the full
# data set so the reported coefficients use all available observations.
regfit.best <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
# Report the coefficients of the model size with the lowest validation MSE.
coef(regfit.best, which.min(val.errors))## (Intercept) AtBat Hits Walks CAtBat
## 162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798
## CRuns CRBI CWalks DivisionW PutOuts
## 1.4082490 0.7743122 -0.8308264 -112.3800575 0.2973726
## Assists
## 0.2831680
# Predict method for regsubsets objects (the leaps package provides none).
#   object:  a fitted regsubsets object
#   newdata: data frame to predict on (must contain the model variables)
#   id:      size of the subset model whose coefficients to use
# Returns a one-column matrix of predictions for the id-variable model.
predict.regsubsets <- function(object, newdata, id, ...) {
  # Recover the model formula from the original regsubsets() call.
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  # FIX: the original had names(coef) — the names of the base R coef()
  # function, which are NULL — so zero columns were ever selected. It must
  # be names(coefi), the variables in the chosen model.
  xvars <- names(coefi)
  # drop = FALSE keeps mat a matrix even when only one column is selected.
  mat[, xvars, drop = FALSE] %*% coefi
}
We now try to choose among the models of different sizes using cross-validation. This approach is somewhat involved, as we must perform best subset selection within each of the k training sets. First, we create a vector that allocates each observation to one of k=10 folds, and we create a matrix in which we will store the results.
# Number of cross-validation folds.
k<-10
# Fix the RNG seed so the fold assignment is reproducible.
set.seed(1)
# Randomly assign each observation to one of the k folds.
folds<-sample(1:k, nrow(Hitters), replace=TRUE)
cv.errors<-matrix(NA, k,19, dimnames=list(NULL, paste(1:19)))Now we write a for loop that performs cross-validation. In the jth fold, the elements of folds that equal j are in the test set, and the remainder are in the training set. We make our predictions for each model size, compute the test errors on the appropriate subset, and store them in the appropriate slot in the matrix cv.errors.
# 10-fold cross-validation: hold out fold j, run best subset selection on
# the remaining folds, and record the held-out MSE for every model size.
for (j in seq_len(k)) {
  best.fit <- regsubsets(Salary ~ ., data = Hitters[folds != j, ], nvmax = 19)
  for (i in seq_len(19)) {
    # Dispatches to the predict.regsubsets() method defined above.
    pred <- predict(best.fit, Hitters[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Hitters$Salary[folds == j] - pred)^2)
  }
}
This has given us a 10*19 matrix, of which the (j,i)th element corresponds to the test MSE for the jth cross-validation fold for the best i-variable model. We use the apply() function to average over the columns of this matrix in order to obtain a vector for which the ith element is the cross-validation error for the i-variable model.
# Average the CV error over the 10 folds for each model size.
mean.cv.errors<-apply(cv.errors, 2, mean)
# NOTE(review): the all-NaN output below indicates the fold errors were never
# actually computed in this knit — most likely because of the names(coef)
# typo inside predict.regsubsets(). Re-knit after fixing it and confirm real
# numbers appear here.
mean.cv.errors## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## 19
## NaN
We see that cross-validation selects an 11-variable model. (Note: the all-NaN cross-validation output above shows these errors were not actually computed in this run — re-run after fixing predict.regsubsets() to confirm.) We now perform best subset selection on the full data set in order to obtain the 11-variable model.
# Refit best subset selection on the full data set and report the
# coefficients of the 11-variable model chosen by cross-validation.
reg.best<-regsubsets(Salary~., data=Hitters, nvmax=19)
coef(reg.best,11)## (Intercept) AtBat Hits Walks CAtBat
## 135.7512195 -2.1277482 6.9236994 5.6202755 -0.1389914
## CRuns CRBI CWalks LeagueN DivisionW
## 1.4553310 0.7852528 -0.8228559 43.1116152 -111.1460252
## PutOuts Assists
## 0.2894087 0.2688277